This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## Remove every object from the current workspace before the analysis starts
## NOTE(review): when this script is sourced into an interactive session this
## also deletes the caller's own objects -- confirm that is intended.
remove(list = ls())
## Install and load libraries
library(tm)
library(dplyr)
library(stringi)
library(stringr)
library(wordcloud)
library(ggplot2)
library(plotly)
library(SnowballC) ## Snowball stemmers based on the C libstemmer UTF-8 library
library(corrplot)
library(heatmaply) ## For drawing dendrogram along with correlation
## Read the tweets CSV into a data frame, keeping text columns as character
## vectors rather than factors.
## NOTE(review): the path is absolute and machine-specific -- adjust it before
## running this script on another machine.
modi.tweets <- read.csv(
  "/Users/ritesh/pad-datascience/R/unstructureData/data/narendramodi_tweets.csv",
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
)
## Make sure the text column is a character vector
modi.tweets$text <- as.character(modi.tweets$text)
## Replace every character that is NOT a letter, a digit, '/', an apostrophe or
## a space with a single space (so '#', '_', ':' and '.' are all blanked out;
## the repeated '/' inside the character class is redundant but harmless)
modi.tweets$text_transformed <- gsub("[^A-Za-z0-9///' ]", " ", modi.tweets$text)
## Store the length of each cleaned tweet in the column 'character_count'.
## nchar() is vectorised, so no lapply()/list round-trip is needed; as.numeric()
## keeps the column a double, as before.
modi.tweets$character_count <- as.numeric(nchar(modi.tweets$text_transformed))
## Parse the created_at column (a character vector) into Date values
modi.tweets$created_at <- as.Date(modi.tweets[["created_at"]])
## Keep only the tweets dated strictly after 31 Oct 2016 and strictly before
## 1 Dec 2016, i.e. the tweets of November 2016
modi.tweets.subset <- subset(
  modi.tweets,
  created_at > as.Date("2016-10-31") & created_at < as.Date("2016-12-01")
)
# View(modi.tweets.subset)  # interactive viewer, left disabled
## List the column names of the full data frame
colnames(modi.tweets)
## [1] "id" "retweets_count" "favorite_count"
## [4] "created_at" "text" "lang"
## [7] "retweeted" "followers_count" "friends_count"
## [10] "hashtags_count" "description" "location"
## [13] "background_image_url" "source" "text_transformed"
## [16] "character_count"
## Check the class of the created_at column
class(modi.tweets[["created_at"]])
## [1] "Date"
## Draw a scatter plot of favourite count (x) against retweet count (y) for the
## November subset. Point size is mapped to character_count and fill colour to
## the source column.
modi.tweets.plot <-
  ggplot(
    modi.tweets.subset,
    aes(
      x = favorite_count,
      y = retweets_count,
      size = character_count,
      fill = source
    )
  ) +
  # shape 21 is a circle with a separate fill, so the `fill` aesthetic is visible
  geom_point(shape = 21) +
  ggtitle("Narendra Modi Tweets for the month of Nov") +
  # y-axis label previously read "Re-tweets Connt" (typo)
  labs(x = "Favourite Tweet Count", y = "Re-tweets Count") +
  theme(legend.position = "bottom", legend.direction = "horizontal")
## Interactive rendering of the plot via plotly
ggplotly(modi.tweets.plot)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
## Static rendering of the same plot
plot(modi.tweets.plot)
Use the tm text-mining package to create a word cloud of Modi's tweets. While applying the transformations, use the SnowballC package to apply stemming.
## Build a tm corpus from the cleaned tweet text, one document per tweet
tweet_corpus <- modi.tweets$text_transformed %>%
  VectorSource() %>%
  Corpus()
## inspect() displays the details of a document (here document 997)
inspect(tweet_corpus[[997]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 136
##
## Emphasised on the need to harness a spirit of entrepreneurship among Dalit youngsters so that they become job creators amp innovators
## meta() displays the metadata attached to the same document
meta(tweet_corpus[[997]])
## author : character(0)
## datetimestamp: 2017-07-01 04:53:34
## description : character(0)
## heading : character(0)
## id : 997
## language : en
## origin : character(0)
## as.character() returns the character representation of a document; it is
## also what is used when a document is inspected:
lapply(tweet_corpus[1:2], as.character)
## [[1]]
## [1] "The President's address wonderfully encapsulated India's strengths aspirations potential amp the efforts towards TransformingIndia "
##
## [[2]]
## [1] "Rashtrapati Ji's address to both Houses of Parliament was in depth amp extensive Do hear https //t co/rdKQtjgNNx RashtrapatiBhvn"
############### - Transformations - #############
## With a corpus in hand, the documents are modified (lower-casing, stopword
## removal, stemming, ...) through tm_map(), which applies a function to every
## document of the corpus. The steps below run in the order written.
## Lower-case the text, strip punctuation, then convert to plain text documents
tweet_corpus <- tweet_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(PlainTextDocument)
## Re-wrap the result in a fresh corpus; this works around the error
## 'simple_triplet_matrix 'i, j, v' different lengths'
tweet_corpus <- Corpus(VectorSource(tweet_corpus))
## Drop English stopwords, then stem every word (e.g. learning -> learn,
## walked -> walk) so that different forms of a word collapse into one term
## and are plotted only once in the word cloud
tweet_corpus <- tweet_corpus %>%
  tm_map(removeWords, stopwords('english')) %>%
  tm_map(stemDocument)
## Second removal pass on the stemmed text: extra unwanted words such as
## 'the', 'this' and URL/HTML leftovers, together with the English stopwords
tweet_corpus <- tm_map(
  tweet_corpus,
  removeWords,
  c('the', 'this', 'https', 'http', 'amp', stopwords('english'))
)
## wordcloud() parameters:
## scale: the range of sizes of the words.
## max.words and min.freq: limit the number of words plotted.
## - max.words plots the specified number of words and discards the
##   least frequent terms, whereas
## - min.freq discards all terms whose frequency is below the
##   specified value.
## random.order: FALSE plots the words with the highest frequency first. If it
##   is not set, words are plotted in a random order and the most frequent
##   words may not appear in the centre.
## rot.per: the fraction of words that are plotted vertically.
## colors: the default is black. To colour words by frequency, pass a vector of
##   colours or one of the pre-defined colour palettes.
## Create Word Cloud
## The colour vector is passed directly as rainbow(6). The previous
## palette(rainbow(6)) call replaced the global graphics palette and returned
## the OLD palette, so the cloud was not drawn with the rainbow colours.
wordcloud(
  tweet_corpus,
  max.words = 200,
  random.order = FALSE,
  colors = rainbow(6)
)
Check the association between the top 10 hashtags.
a. Use corrplot to show the correlations
b. Using any other package, draw the correlations along with dendrograms
Use corrplot to show the correlations
## Select the columns named in top.hashtags
## NOTE(review): hashtags.dtm.df and top.hashtags are not defined in the part
## of the script shown here -- confirm they are created before this point.
hashtags.tdm.df.10 <- hashtags.dtm.df[, top.hashtags]
# hashtags.tdm.df.10
## Correlation matrix between the selected columns
cor_hashtags <- cor(x = hashtags.tdm.df.10)
## Correlation plot drawn with ellipses
corrplot(corr = cor_hashtags, method = "ellipse")
## The same matrix drawn with corrplot.mixed()
corrplot.mixed(corr = cor_hashtags)
Using any other package, draw the correlation along with dendrograms
## Correlation diagram with dendrogram, drawn with the 'heatmaply' package
heatmaply::heatmaply_cor(cor_hashtags)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`